In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import plotly.io as pio
pio.renderers.default='notebook'
In [152]:
%%HTML
<script src="require.js"></script>
In [2]:
meta_df_1000 = pd.read_parquet('meta_df_1000_cleaned.parquet')
reviews_df = pd.read_parquet('reviews_df_clean.parquet')
In [3]:
categories_arr1 = meta_df_1000.category.apply(lambda x: x[1])
categories_arr2 = meta_df_1000.category.apply(lambda x: x[2] if len(x)>=3 else None)
In [4]:
categories_arr1
Out[4]:
158       eBook Readers & Accessories
243            Accessories & Supplies
947            Accessories & Supplies
1336          Computers & Accessories
2327           Accessories & Supplies
                     ...             
780642                     Headphones
781908        Computers & Accessories
782902        Computers & Accessories
783568      Car & Vehicle Electronics
785453        Computers & Accessories
Name: category, Length: 786, dtype: object
In [6]:
import plotly.express as px
fig = px.histogram(categories_arr1, x='category', text_auto=True)
fig.update_layout(
    title_text='Frequencies of 1st sub-categories within Electronics'
)
fig.show()
In [398]:
merged_df = pd.merge(meta_df_1000, reviews_df, on='asin', how='inner')
merged_df['category_1'] = merged_df.category.apply(lambda x: x[1])
merged_df['rating'] = merged_df['rating'].astype(float)
In [145]:
fig = px.histogram(merged_df, x='category_1', y='rating', histfunc='avg', text_auto=True)
fig.update_layout(
    title_text='Mean rating over categories for all subcategories within Electronics'
)
fig.show()
In [146]:
product_mean_rating = merged_df.groupby('asin')\
                        .agg(mean_rating=('rating', 'mean'), count=('asin', 'count'))\
                        .reset_index()
product_mean_rating = product_mean_rating.sort_values('count')
product_mean_rating
Out[146]:
asin mean_rating count
649 B00Y86CJ1A 5.00 1
150 B000Y1NES0 5.00 1
152 B000YZ63MK 2.00 1
490 B00IML19MC 5.00 1
155 B0011FZB52 5.00 1
... ... ... ...
569 B00OBTO8EA 3.90 20
570 B00ODEU0PY 4.40 20
203 B0022NHQB4 3.85 20
229 B0039NM5SK 4.45 20
775 B01HEKL4KI 2.60 20

776 rows × 3 columns

In [409]:
fig = px.scatter(product_mean_rating, x='count', y='mean_rating', trendline="ols")
fig.update_layout(
    title_text='Product Mean rating vs Count of product reviews in the dataset'
)
In [116]:
!jupyter nbconvert --to html visualization.ipynb
[NbConvertApp] Converting notebook visualization.ipynb to html
[NbConvertApp] Writing 819185 bytes to visualization.html
In [117]:
fig.write_html('plot1.html',
                full_html=False,
                include_plotlyjs='cdn')
In [148]:
px.scatter(merged_df, x="rating", y='found_helpful', trendline='ols')
In [149]:
fig = px.density_heatmap(merged_df, x="verified_purchase", y="category_1", z="rating", histfunc="avg", marginal_x="histogram", marginal_y="histogram", text_auto=True)
fig.update_layout(
    title_text='Avg ratings for each category and verified purchase combos'
)
fig.show()
In [150]:
meta_df_1000['rank']
Out[150]:
232       {'Amazon Launchpad ': None, 'Amazon Launchpad ...
738       {'Amazon Launchpad ': None, 'Amazon Launchpad ...
2385      {'Amazon Launchpad ': None, 'Amazon Launchpad ...
2573      {'Amazon Launchpad ': None, 'Amazon Launchpad ...
3269      {'Amazon Launchpad ': None, 'Amazon Launchpad ...
                                ...                        
783492    {'Amazon Launchpad ': None, 'Amazon Launchpad ...
783763    {'Amazon Launchpad ': None, 'Amazon Launchpad ...
784274    {'Amazon Launchpad ': None, 'Amazon Launchpad ...
784963    {'Amazon Launchpad ': None, 'Amazon Launchpad ...
785097    {'Amazon Launchpad ': None, 'Amazon Launchpad ...
Name: rank, Length: 799, dtype: object
In [193]:
rank_df = merged_df[['asin', 'rank', 'rating', 'category_1']].explode('rank')
rank_df = rank_df.dropna(subset='rank').reset_index(drop=True)
rank_df
Out[193]:
asin rank rating category_1
0 1039869017 [Computers & Accessories > Tablet Accessories ... 5.0 Computers & Accessories
1 1039869017 [Computers & Accessories > Tablet Accessories ... 5.0 Computers & Accessories
2 1944288023 [Cell Phones & Accessories , 1,053,995] 5.0 Headphones
3 1944288023 [Cell Phones & Accessories > Cell Phone Access... 5.0 Headphones
4 1944288023 [Electronics > Home Audio & Theater, 153,549] 5.0 Headphones
... ... ... ... ...
10776 B01HEKL4KI [Electronics > Car Electronics > Car Video > O... 3.0 Car & Vehicle Electronics
10777 B01HEKL4KI [Electronics > Car Electronics > Car Audio, 9,... 3.0 Car & Vehicle Electronics
10778 B01HEKL4KI [Electronics , 163,429] 3.0 Car & Vehicle Electronics
10779 B01HEKL4KI [Electronics > Car Electronics > Car Video > O... 3.0 Car & Vehicle Electronics
10780 B01HEKL4KI [Electronics > Car Electronics > Car Audio, 9,... 3.0 Car & Vehicle Electronics

10781 rows × 4 columns

In [200]:
rank_df = pd.concat([rank_df.drop(columns='rank'), pd.DataFrame(rank_df['rank'].to_list(), columns=['category', 'rank'])], axis=1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [200], line 1
----> 1 rank_df = pd.concat([rank_df.drop(columns='rank'), pd.DataFrame(rank_df['rank'].to_list(), columns=['category', 'rank'])], axis=1)
      2 rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False)

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/frame.py:761, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    753         mgr = arrays_to_mgr(
    754             arrays,
    755             columns,
   (...)
    758             typ=manager,
    759         )
    760     else:
--> 761         mgr = ndarray_to_mgr(
    762             data,
    763             index,
    764             columns,
    765             dtype=dtype,
    766             copy=copy,
    767             typ=manager,
    768         )
    769 else:
    770     mgr = dict_to_mgr(
    771         {},
    772         index,
   (...)
    775         typ=manager,
    776     )

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/internals/construction.py:349, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
    344 # _prep_ndarraylike ensures that values.ndim == 2 at this point
    345 index, columns = _get_axes(
    346     values.shape[0], values.shape[1], index=index, columns=columns
    347 )
--> 349 _check_values_indices_shape_match(values, index, columns)
    351 if typ == "array":
    353     if issubclass(values.dtype.type, str):

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/internals/construction.py:420, in _check_values_indices_shape_match(values, index, columns)
    418 passed = values.shape
    419 implied = (len(index), len(columns))
--> 420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")

ValueError: Shape of passed values is (10781, 1), indices imply (10781, 2)
In [202]:
rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False).astype(float)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [202], line 1
----> 1 rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False).astype(float)
      2 rank_df

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name)
   5895 if (
   5896     name not in self._internal_names_set
   5897     and name not in self._metadata
   5898     and name not in self._accessors
   5899     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5900 ):
   5901     return self[name]
-> 5902 return object.__getattribute__(self, name)

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls)
    179 if obj is None:
    180     # we're accessing the attribute of the class, i.e., Dataset.geo
    181     return self._accessor
--> 182 accessor_obj = self._accessor(obj)
    183 # Replace the property with the accessor object. Inspired by:
    184 # https://www.pydanny.com/cached-property.html
    185 # We need to use object.__setattr__ because we overwrite __setattr__ on
    186 # NDFrame
    187 object.__setattr__(obj, self._name, accessor_obj)

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data)
    178 def __init__(self, data) -> None:
    179     from pandas.core.arrays.string_ import StringDtype
--> 181     self._inferred_dtype = self._validate(data)
    182     self._is_categorical = is_categorical_dtype(data.dtype)
    183     self._is_string = isinstance(data.dtype, StringDtype)

File ~/miniforge3/lib/python3.10/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data)
    232 inferred_dtype = lib.infer_dtype(values, skipna=True)
    234 if inferred_dtype not in allowed_types:
--> 235     raise AttributeError("Can only use .str accessor with string values!")
    236 return inferred_dtype

AttributeError: Can only use .str accessor with string values!
In [203]:
rank_df
Out[203]:
asin rating category_1 category rank
0 1039869017 5.0 Computers & Accessories Computers & Accessories > Tablet Accessories >... 151274.0
1 1039869017 5.0 Computers & Accessories Computers & Accessories > Tablet Accessories >... 151274.0
2 1944288023 5.0 Headphones Cell Phones & Accessories 1053995.0
3 1944288023 5.0 Headphones Cell Phones & Accessories > Cell Phone Accesso... 56064.0
4 1944288023 5.0 Headphones Electronics > Home Audio & Theater 153549.0
... ... ... ... ... ...
10776 B01HEKL4KI 3.0 Car & Vehicle Electronics Electronics > Car Electronics > Car Video > On 1532.0
10777 B01HEKL4KI 3.0 Car & Vehicle Electronics Electronics > Car Electronics > Car Audio 9997.0
10778 B01HEKL4KI 3.0 Car & Vehicle Electronics Electronics 163429.0
10779 B01HEKL4KI 3.0 Car & Vehicle Electronics Electronics > Car Electronics > Car Video > On 1532.0
10780 B01HEKL4KI 3.0 Car & Vehicle Electronics Electronics > Car Electronics > Car Audio 9997.0

10781 rows × 5 columns

In [209]:
fig = px.box(rank_df[rank_df['rank'] < 1000000], x="category_1", y="rank", color="rating",
            category_orders={'rating': [5.0, 4.0, 3.0, 2.0, 1.0]})
fig.show()
In [217]:
fig = px.box(rank_df[rank_df['rank'] < 500000], x="rating", y="rank", color='category_1',
            category_orders={'rating': [5.0, 4.0, 3.0, 2.0, 1.0]})
fig.show()
In [279]:
import nltk
import spacy
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import rcParams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
In [234]:
nlp = spacy.load('en_core_web_sm')
Out[234]:
0       (This, cover, was, an, amazing, deal, ,, not, ...
1       (Works, great, ., Has, a, obviously, fake, lea...
2       (This, may, be, a, kids, book, ,, but, I, sure...
3       (Such, a, great, book, !, The, illustrations, ...
4       (Whimsical, ,, poignant, ,, a, breath, of, fre...
                              ...                        
5892    (Its, good, for, they, price, ,, i, just, wish...
5893    (I, like, this, product, ,, but, it, is, hard,...
5894    (This, little, DVR, is, Ok, for, the, money, d...
5895        (Play, back, just, give, picture, not, video)
5896    (Difficult, to, figure, out, for, an, old, per...
Name: tokens, Length: 5897, dtype: object
In [288]:
merged_df['tokens'] = merged_df['content'].apply(lambda x: nlp(x.lower())) 
merged_df.tokens
Out[288]:
0       (this, cover, was, an, amazing, deal, ,, not, ...
1       (works, great, ., has, a, obviously, fake, lea...
2       (this, may, be, a, kids, book, ,, but, i, sure...
3       (such, a, great, book, !, the, illustrations, ...
4       (whimsical, ,, poignant, ,, a, breath, of, fre...
                              ...                        
5892    (its, good, for, they, price, ,, i, just, wish...
5893    (i, like, this, product, ,, but, it, is, hard,...
5894    (this, little, dvr, is, ok, for, the, money, d...
5895        (play, back, just, give, picture, not, video)
5896    (difficult, to, figure, out, for, an, old, per...
Name: tokens, Length: 5897, dtype: object
In [289]:
merged_df['tokens'] = merged_df.tokens.apply(lambda x: [w.lemma_ for w in x if not w.is_stop and not w.is_punct and w.lemma_!=' '])
# for w in merged_df.tokens[0]:
#     print (w, w.lemma_, w.is_stop, w.is_punct)
In [290]:
word_freq_df = merged_df[['category_1', 'tokens']].explode('tokens').reset_index(drop=True)\
            .groupby(['category_1', 'tokens']).agg(count=('category_1', 'count')).reset_index().sort_values('count', ascending=False)
In [291]:
word_freq_df[word_freq_df['tokens']=='one']
Out[291]:
category_1 tokens count
16711 Computers & Accessories one 47
23841 Headphones one 39
7076 Camera & Photo one 28
2297 Accessories & Supplies one 20
10563 Car & Vehicle Electronics one 9
28871 Portable Audio & Video one 6
30281 Security & Surveillance one 2
31974 Television & Video one 2
26672 Home Audio one 2
20935 GPS, Finders & Accessories one 1
In [309]:
tokens_df = merged_df[['category_1', 'tokens']].copy()
tokens_df['tokens'] = tokens_df['tokens'].apply(lambda x: ' '.join(x))
category_tokens = tokens_df.groupby('category_1').tokens.agg(lambda x: ' '.join(x.to_list()))
In [312]:
category_tokens.index
Out[312]:
Index(['Accessories & Supplies', 'Camera & Photo', 'Car & Vehicle Electronics',
       'Computers & Accessories', 'Electronics Warranties',
       'GPS, Finders & Accessories', 'Headphones', 'Home Audio',
       'Portable Audio & Video', 'Security & Surveillance', 'Service Plans',
       'Television & Video', 'Video Projectors',
       'eBook Readers & Accessories'],
      dtype='object', name='category_1')
In [317]:
wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000, min_font_size=2).generate(' '.join(merged_df.content.to_list()))
In [321]:
for cat in category_tokens.index:
    print (cat)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000,
                         width=1000, height=1000).generate(category_tokens[cat])
    rcParams['figure.figsize'] = 50,50
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Accessories & Supplies
Camera & Photo
Car & Vehicle Electronics
Computers & Accessories
Electronics Warranties
GPS, Finders & Accessories
Headphones
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:463, in ImageDraw.text.<locals>.draw_text(ink, stroke_width, stroke_offset)
    462 try:
--> 463     mask, offset = font.getmask2(
    464         text,
    465         mode,
    466         direction=direction,
    467         features=features,
    468         language=language,
    469         stroke_width=stroke_width,
    470         anchor=anchor,
    471         ink=ink,
    472         start=start,
    473         *args,
    474         **kwargs,
    475     )
    476     coord = coord[0] + offset[0], coord[1] + offset[1]

AttributeError: 'TransposedFont' object has no attribute 'getmask2'

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
Cell In [321], line 6
      3 wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000,
      4                      width=1000, height=1000).generate(category_tokens[cat])
      5 rcParams['figure.figsize'] = 30,30
----> 6 plt.imshow(wordcloud)
      7 plt.axis("off")
      8 plt.show()

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/_api/deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs)
    448 if len(args) > name_idx:
    449     warn_deprecated(
    450         since, message="Passing the %(name)s %(obj_type)s "
    451         "positionally is deprecated since Matplotlib %(since)s; the "
    452         "parameter will become keyword-only %(removal)s.",
    453         name=name, obj_type=f"parameter of {func.__name__}()")
--> 454 return func(*args, **kwargs)

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/pyplot.py:2631, in imshow(X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, interpolation_stage, filternorm, filterrad, resample, url, data, **kwargs)
   2625 @_copy_docstring_and_deprecators(Axes.imshow)
   2626 def imshow(
   2627         X, cmap=None, norm=None, aspect=None, interpolation=None,
   2628         alpha=None, vmin=None, vmax=None, origin=None, extent=None, *,
   2629         interpolation_stage=None, filternorm=True, filterrad=4.0,
   2630         resample=None, url=None, data=None, **kwargs):
-> 2631     __ret = gca().imshow(
   2632         X, cmap=cmap, norm=norm, aspect=aspect,
   2633         interpolation=interpolation, alpha=alpha, vmin=vmin,
   2634         vmax=vmax, origin=origin, extent=extent,
   2635         interpolation_stage=interpolation_stage,
   2636         filternorm=filternorm, filterrad=filterrad, resample=resample,
   2637         url=url, **({"data": data} if data is not None else {}),
   2638         **kwargs)
   2639     sci(__ret)
   2640     return __ret

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/_api/deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs)
    448 if len(args) > name_idx:
    449     warn_deprecated(
    450         since, message="Passing the %(name)s %(obj_type)s "
    451         "positionally is deprecated since Matplotlib %(since)s; the "
    452         "parameter will become keyword-only %(removal)s.",
    453         name=name, obj_type=f"parameter of {func.__name__}()")
--> 454 return func(*args, **kwargs)

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/__init__.py:1433, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs)
   1430 @functools.wraps(func)
   1431 def inner(ax, *args, data=None, **kwargs):
   1432     if data is None:
-> 1433         return func(ax, *map(sanitize_sequence, args), **kwargs)
   1435     bound = new_sig.bind(ax, *args, **kwargs)
   1436     auto_label = (bound.arguments.get(label_namer)
   1437                   or bound.kwargs.get(label_namer))

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/axes/_axes.py:5610, in Axes.imshow(self, X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, interpolation_stage, filternorm, filterrad, resample, url, **kwargs)
   5602 self.set_aspect(aspect)
   5603 im = mimage.AxesImage(self, cmap=cmap, norm=norm,
   5604                       interpolation=interpolation, origin=origin,
   5605                       extent=extent, filternorm=filternorm,
   5606                       filterrad=filterrad, resample=resample,
   5607                       interpolation_stage=interpolation_stage,
   5608                       **kwargs)
-> 5610 im.set_data(X)
   5611 im.set_alpha(alpha)
   5612 if im.get_clip_path() is None:
   5613     # image does not already have clipping set, clip to axes patch

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/image.py:697, in _ImageBase.set_data(self, A)
    695 if isinstance(A, PIL.Image.Image):
    696     A = pil_to_array(A)  # Needed e.g. to apply png palette.
--> 697 self._A = cbook.safe_masked_invalid(A, copy=True)
    699 if (self._A.dtype != np.uint8 and
    700         not np.can_cast(self._A.dtype, float, "same_kind")):
    701     raise TypeError("Image data of dtype {} cannot be converted to "
    702                     "float".format(self._A.dtype))

File ~/miniforge3/lib/python3.10/site-packages/matplotlib/cbook/__init__.py:743, in safe_masked_invalid(x, copy)
    742 def safe_masked_invalid(x, copy=False):
--> 743     x = np.array(x, subok=True, copy=copy)
    744     if not x.dtype.isnative:
    745         # If we have already made a copy, do the byteswap in place, else make a
    746         # copy with the byte order swapped.
    747         x = x.byteswap(inplace=copy).newbyteorder('N')  # Swap to native order.

File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:747, in WordCloud.__array__(self)
    739 def __array__(self):
    740     """Convert to numpy array.
    741 
    742     Returns
   (...)
    745         Word cloud image as numpy matrix.
    746     """
--> 747     return self.to_array()

File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:737, in WordCloud.to_array(self)
    729 def to_array(self):
    730     """Convert to numpy array.
    731 
    732     Returns
   (...)
    735         Word cloud image as numpy matrix.
    736     """
--> 737     return np.array(self.to_image())

File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:666, in WordCloud.to_image(self)
    662     transposed_font = ImageFont.TransposedFont(
    663         font, orientation=orientation)
    664     pos = (int(position[1] * self.scale),
    665            int(position[0] * self.scale))
--> 666     draw.text(pos, word, fill=color, font=transposed_font)
    668 return self._draw_contour(img=img)

File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:520, in ImageDraw.text(self, xy, text, fill, font, anchor, spacing, align, direction, features, language, stroke_width, stroke_fill, embedded_color, *args, **kwargs)
    517     draw_text(ink, 0)
    518 else:
    519     # Only draw normal text
--> 520     draw_text(ink)

File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:479, in ImageDraw.text.<locals>.draw_text(ink, stroke_width, stroke_offset)
    477 except AttributeError:
    478     try:
--> 479         mask = font.getmask(
    480             text,
    481             mode,
    482             direction,
    483             features,
    484             language,
    485             stroke_width,
    486             anchor,
    487             ink,
    488             start=start,
    489             *args,
    490             **kwargs,
    491         )
    492     except TypeError:
    493         mask = font.getmask(text)

File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:908, in TransposedFont.getmask(self, text, mode, *args, **kwargs)
    907 def getmask(self, text, mode="", *args, **kwargs):
--> 908     im = self.font.getmask(text, mode, *args, **kwargs)
    909     if self.orientation is not None:
    910         return im.transpose(self.orientation)

File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:665, in FreeTypeFont.getmask(self, text, mode, direction, features, language, stroke_width, anchor, ink, start)
    587 def getmask(
    588     self,
    589     text,
   (...)
    597     start=None,
    598 ):
    599     """
    600     Create a bitmap for the text.
    601 
   (...)
    663              :py:mod:`PIL.Image.core` interface module.
    664     """
--> 665     return self.getmask2(
    666         text,
    667         mode,
    668         direction=direction,
    669         features=features,
    670         language=language,
    671         stroke_width=stroke_width,
    672         anchor=anchor,
    673         ink=ink,
    674         start=start,
    675     )[0]

File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:778, in FreeTypeFont.getmask2(self, text, mode, fill, direction, features, language, stroke_width, anchor, ink, start, *args, **kwargs)
    776 Image._decompression_bomb_check(size)
    777 im = fill("RGBA" if mode == "RGBA" else "L", size, 0)
--> 778 self.font.render(
    779     text,
    780     im.id,
    781     mode,
    782     direction,
    783     features,
    784     language,
    785     stroke_width,
    786     ink,
    787     start[0],
    788     start[1],
    789 )
    790 return im, offset

KeyboardInterrupt: 
In [325]:
tokens_df = merged_df[['rating', 'tokens']].copy()
tokens_df['tokens'] = tokens_df['tokens'].apply(lambda x: ' '.join(x))
rating_tokens = tokens_df.groupby('rating').tokens.agg(lambda x: ' '.join(x.to_list()))
rating_tokens
Out[325]:
rating
1.0    recently buy wife acura 3.2 tl navigation syst...
2.0    product lack want ill stick laptop like compac...
3.0    hi clock setting brightness number big light f...
4.0    buy streetpilot move new city want lose soon b...
5.0    cover amazing deal fit perfect protect cheaply...
Name: tokens, dtype: object
In [326]:
for rat in rating_tokens.index:
    print (rat)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000,
                         width=1000, height=1000).generate(rating_tokens[rat])
    rcParams['figure.figsize'] = 50,50
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
1.0
2.0
3.0
4.0
5.0
In [330]:
stopwords = STOPWORDS.update(['work', 'time', 'buy', 'product','use', 'camera'])
for cat in category_tokens.index:
    print (cat)
    
    wordcloud = WordCloud(stopwords=STOPWORDS, relative_scaling=0.3, background_color="white", max_words=1000,
                         width=1000, height=1000).generate(category_tokens[cat])
    rcParams['figure.figsize'] = 50,50
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
1.0
2.0
3.0
4.0
5.0
In [ ]:
#most frequent words for each rating
In [ ]:
# common words in each category
In [ ]:
#wordmap
In [162]:
df = pd.DataFrame({
    "a" : [1,2,3],
    "b" : [[1,2],[2,3,4],[5]],
    "c" : [5,6,7]
})
df
Out[162]:
a b c
0 1 [1, 2] 5
1 2 [2, 3, 4] 6
2 3 [5] 7
In [167]:
df.b.str.len()
Out[167]:
0    2
1    3
2    1
Name: b, dtype: int64
In [350]:
freq_brands_15 = merged_df.brand.value_counts().sort_values(ascending=False)[:15].reset_index().rename(columns={'index': 'brand_name', 'brand': 'counts'})
freq_brands_15
Out[350]:
brand_name counts
0 Sony 161
1 Panasonic 89
2 Samsung 76
3 Canon 72
4 Dell 56
5 HP 55
6 JVC 49
7 StarTech 48
8 Acer 47
9 Lenovo 47
10 Koss 44
11 uxcell 43
12 Kodak 41
13 JJC 40
14 Lexar 40
In [352]:
px.bar(freq_brands_15, x='brand_name', y='counts', text_auto=True)
In [356]:
brand_rating_15 = merged_df.loc[merged_df.brand.isin(freq_brands_15['brand_name']), ['brand', 'rating']]
In [360]:
px.box(brand_rating_15, x='brand', y='rating')
In [365]:
brand_rating_freq = brand_rating_15.groupby(['brand', 'rating']).agg(count=('brand', 'count')).reset_index()
brand_rating_freq
Out[365]:
brand rating count
0 Acer 1.0 6
1 Acer 2.0 4
2 Acer 3.0 6
3 Acer 4.0 17
4 Acer 5.0 14
... ... ... ...
69 uxcell 1.0 5
70 uxcell 2.0 1
71 uxcell 3.0 8
72 uxcell 4.0 16
73 uxcell 5.0 13

74 rows × 3 columns

In [366]:
px.scatter(brand_rating_freq, x='brand', y='rating', size='count')
In [390]:
category_verified_pur = merged_df[['category_1', 'verified_purchase']].value_counts().rename('counts').reset_index()
category_verified_pur
Out[390]:
category_1 verified_purchase counts
0 Computers & Accessories True 2074
1 Camera & Photo True 1081
2 Accessories & Supplies True 939
3 Headphones True 290
4 Car & Vehicle Electronics True 273
5 Computers & Accessories False 249
6 Portable Audio & Video True 234
7 Home Audio True 155
8 Camera & Photo False 123
9 Security & Surveillance True 78
10 GPS, Finders & Accessories True 76
11 Television & Video False 55
12 Portable Audio & Video False 41
13 Television & Video True 38
14 Accessories & Supplies False 32
15 Headphones False 29
16 Home Audio False 26
17 GPS, Finders & Accessories False 18
18 Car & Vehicle Electronics False 17
19 Video Projectors False 15
20 Service Plans True 13
21 Electronics Warranties False 13
22 Security & Surveillance False 8
23 Service Plans False 7
24 Video Projectors True 6
25 Electronics Warranties True 4
26 eBook Readers & Accessories True 3
In [392]:
# merged_df
fig = px.sunburst(
    category_verified_pur,
    path=['category_1', 'verified_purchase'],
    values='counts'
)

fig.show()
In [411]:
merged_df
Out[411]:
category description title_x also_buy brand feature rank also_view main_cat date_x ... asin title_y content date_y author rating found_helpful verified_purchase product category_1
0 [Electronics, Computers & Accessories, Tablet ... Brand new and high quality\nLightweight soft c... PU Leather 360 Degree Rotating Stand Case Cove... [] new brand Brand new and high quality Lightweight soft ca... [[Computers & Accessories > Tablet Accessories... [] Computers 2012-11-23 ... 1039869017 love it!! This cover was an amazing deal, not only does ... 2013-03-08 lorena mahoney 5.0 1 True Yellow Protective Silicone Gel Slim Thin Back ... Computers & Accessories
1 [Electronics, Computers & Accessories, Tablet ... Brand new and high quality\nLightweight soft c... PU Leather 360 Degree Rotating Stand Case Cove... [] new brand Brand new and high quality Lightweight soft ca... [[Computers & Accessories > Tablet Accessories... [] Computers 2012-11-23 ... 1039869017 Works fine Works great. Has a obviously fake leather exte... 2013-09-05 Alex P 5.0 0 True Yellow Protective Silicone Gel Slim Thin Back ... Computers & Accessories
2 [Electronics, Headphones] This beautifully illustrated children's book t... The Legend of the Starfish [] The Joy Market [[Cell Phones & Accessories , 1,053,995], [Cel... [] Cell Phones & Accessories NaT ... 1944288023 but I sure enjoyed it myself This may be a kids book, but I sure enjoyed it... 2016-04-22 RedCurlz 5.0 0 False The Legend of the Starfish Headphones
3 [Electronics, Headphones] This beautifully illustrated children's book t... The Legend of the Starfish [] The Joy Market [[Cell Phones & Accessories , 1,053,995], [Cel... [] Cell Phones & Accessories NaT ... 1944288023 Such a great book! The illustrations are beaut... Such a great book! The illustrations are beaut... 2016-04-19 Micah Wood 5.0 1 False The Legend of the Starfish Headphones
4 [Electronics, Headphones] This beautifully illustrated children's book t... The Legend of the Starfish [] The Joy Market [[Cell Phones & Accessories , 1,053,995], [Cel... [] Cell Phones & Accessories NaT ... 1944288023 Add to your library! Whimsical, poignant, a breath of fresh air. A ... 2016-04-19 KAT 5.0 1 False The Legend of the Starfish Headphones
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5892 [Electronics, Car & Vehicle Electronics, Car E... Feature : G-sensor function and motion detecti... Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... [] Lecmal 3\n3\n3\n3\n3 [[Electronics , 163,429], [Electronics > Car E... [] Car Electronics 2016-06-22 ... B01HEKL4KI Its good for they price Its good for they price, i just wish it had a ... 2016-08-29 Remo 4.0 0 True Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... Car & Vehicle Electronics
5893 [Electronics, Car & Vehicle Electronics, Car E... Feature : G-sensor function and motion detecti... Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... [] Lecmal 3\n3\n3\n3\n3 [[Electronics , 163,429], [Electronics > Car E... [] Car Electronics 2016-06-22 ... B01HEKL4KI I like this product I like this product , but it is hard to use af... 2017-11-25 Donna B 4.0 0 True Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... Car & Vehicle Electronics
5894 [Electronics, Car & Vehicle Electronics, Car E... Feature : G-sensor function and motion detecti... Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... [] Lecmal 3\n3\n3\n3\n3 [[Electronics , 163,429], [Electronics > Car E... [] Car Electronics 2016-06-22 ... B01HEKL4KI ... DVR is Ok for the money daytime it works g... This little DVR is Ok for the money daytime it... 2016-08-20 Hawk 4.0 2 True Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... Car & Vehicle Electronics
5895 [Electronics, Car & Vehicle Electronics, Car E... Feature : G-sensor function and motion detecti... Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... [] Lecmal 3\n3\n3\n3\n3 [[Electronics , 163,429], [Electronics > Car E... [] Car Electronics 2016-06-22 ... B01HEKL4KI Three Stars Play back just give picture not video 2017-01-10 Fred Maragheh 3.0 0 True Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... Car & Vehicle Electronics
5896 [Electronics, Car & Vehicle Electronics, Car E... Feature : G-sensor function and motion detecti... Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... [] Lecmal 3\n3\n3\n3\n3 [[Electronics , 163,429], [Electronics > Car E... [] Car Electronics 2016-06-22 ... B01HEKL4KI Three Stars Difficult to figure out for an old person. 2016-08-30 bamatutz 3.0 0 True Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... Car & Vehicle Electronics

5897 rows × 21 columns

In [413]:
merged_df.found_helpful.value_counts()
Out[413]:
0      3570
1       947
2       420
3       234
4       133
       ... 
89        1
118       1
75        1
65        1
275       1
Name: found_helpful, Length: 75, dtype: int64
In [419]:
px.scatter(merged_df, color='rating', y='found_helpful', x='category_1')
In [443]:
helpful_votes = merged_df.loc[merged_df.found_helpful != 0, ['category_1', 'rating', 'found_helpful']].copy()
In [449]:
helpful_votes['found_helpful_bins'] = pd.cut(helpful_votes.found_helpful, [0,1,2,3,4,6,11,399])
In [455]:
helpful_votes = helpful_votes.drop(columns='found_helpful').value_counts().rename('count').reset_index()
In [465]:
helpful_votes['found_helpful_bins'] = helpful_votes['found_helpful_bins'].astype(str)
helpful_votes['rating'] = helpful_votes['rating'].astype(str)

helpful_votes
Out[465]:
category_1 rating found_helpful_bins count raring
0 Computers & Accessories 5.0 (0, 1] 163 5.0
1 Accessories & Supplies 5.0 (0, 1] 106 5.0
2 Computers & Accessories 4.0 (0, 1] 88 4.0
3 Camera & Photo 5.0 (0, 1] 82 5.0
4 Computers & Accessories 5.0 (1, 2] 77 5.0
... ... ... ... ... ...
290 Home Audio 1.0 (2, 3] 1 1.0
291 Headphones 4.0 (3, 4] 1 4.0
292 Headphones 3.0 (4, 6] 1 3.0
293 Headphones 2.0 (11, 399] 1 2.0
294 Video Projectors 5.0 (11, 399] 1 5.0

295 rows × 5 columns

In [469]:
px.scatter(helpful_votes, x='rating', y='found_helpful_bins', color='category_1', size='count',
          category_orders={'rating': [1.0, 2.0, 3.0, 4.0, 5.0],
                            'found_helpful_bins': ['(0, 1]', '(1, 2]', '(2, 3]', '(3, 4]', '(4, 6]', '(6, 11]', '(11, 399]'][::-1]})
In [459]:
helpful_votes
Out[459]:
category_1 rating found_helpful_bins count
0 Computers & Accessories 5.0 (0, 1] 163
1 Accessories & Supplies 5.0 (0, 1] 106
2 Computers & Accessories 4.0 (0, 1] 88
3 Camera & Photo 5.0 (0, 1] 82
4 Computers & Accessories 5.0 (1, 2] 77
... ... ... ... ...
290 Home Audio 1.0 (2, 3] 1
291 Headphones 4.0 (3, 4] 1
292 Headphones 3.0 (4, 6] 1
293 Headphones 2.0 (11, 399] 1
294 Video Projectors 5.0 (11, 399] 1

295 rows × 4 columns

In [ ]: